library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.2
library(leaflet) 
## Warning: package 'leaflet' was built under R version 4.3.2
library(htmlwidgets) 
library(sf) 
## Warning: package 'sf' was built under R version 4.3.3
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(ggmap) 
## ℹ Google's Terms of Service: <https://mapsplatform.google.com>
##   Stadia Maps' Terms of Service: <https://stadiamaps.com/terms-of-service/>
##   OpenStreetMap's Tile Usage Policy: <https://operations.osmfoundation.org/policies/tiles/>
## ℹ Please cite ggmap if you use it! Use `citation("ggmap")` for details.
library(units)
## udunits database from /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/library/units/share/udunits/udunits2.xml
library(dplyr)

Part A. 1.

data.dir = gsub('/code', '/data', getwd())
neighborhoods= st_read(paste0(data.dir, '/hw1-data/Toronto_Neighbourhoods/Toronto_Neighbourhoods.shp'))
## Reading layer `Toronto_Neighbourhoods' from data source 
##   `/Users/davegong/Desktop/year4 fall/sta465/HW1/hw1-data/Toronto_Neighbourhoods/Toronto_Neighbourhoods.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 140 features and 39 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.63926 ymin: 43.581 xmax: -79.11524 ymax: 43.85546
## Geodetic CRS:  WGS 84
subway_lines= st_read(paste0(data.dir, '/hw1-data/ttc-subway-shapefile-wgs84/TTC_SUBWAY_LINES_WGS84.shp'))
## Reading layer `TTC_SUBWAY_LINES_WGS84' from data source 
##   `/Users/davegong/Desktop/year4 fall/sta465/HW1/hw1-data/ttc-subway-shapefile-wgs84/TTC_SUBWAY_LINES_WGS84.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 4 features and 3 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -79.5354 ymin: 43.63781 xmax: -79.25107 ymax: 43.79677
## Geodetic CRS:  WGS 84
subway_lines_wgs84 <- st_transform(subway_lines, crs = 4326)
neighborhoods_wgs84 <- st_transform(neighborhoods, crs = 4326)


crime_sites = read.csv(paste0(data.dir, "/hw1-data/toronto-crimes-06-2024.csv")) %>% 
  mutate(REPORT_DATE = as.Date(REPORT_DATE))

Write-up: Toronto_Neighbourhoods.shp is Polygons/Multipolygons TTC_SUBWAY_LINES_WGS84.shp is Polylines toronto-crimes-06-2024.csv is points/multipoints

.shp: main shapefile .shx: spatial index .dbf: dBase file which stores the non-spatial columns .prj: definition of the projection of the spatial data

head(neighborhoods)
## Simple feature collection with 6 features and 39 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -79.52344 ymin: 43.62049 xmax: -79.28461 ymax: 43.79607
## Geodetic CRS:  WGS 84
##   OBJECTID              Neighbourh Total_Area Total_Popu Pop_Males Pop_Female
## 1        1          Yonge-St.Clair        1.2      11655      5235       6420
## 2        2 York University Heights       13.2      27715     13580      14125
## 3        3        Lansing-Westgate        5.4      14640      6950       7695
## 4        4      Yorkdale-Glen Park        5.9      14685      6750       7940
## 5        5     Stonegate-Queensway        7.9      24690     11935      12745
## 6        6  Tam O'Shanter-Sullivan        5.5      27390     12840      14555
##   Pop0_4year Pop5_9year Pop10_14ye Pop15_19ye Pop20_24ye Pop25_29ye Pop30_34ye
## 1        395        345        235        285        645       1355       1125
## 2       1645       1385       1380       1750       3280       2815       2120
## 3        845        725        745        795        965       1185       1290
## 4        640        690        815        835        845        890        895
## 5       1395       1235       1265       1295       1190       1270       1575
## 6       1460       1380       1370       1560       1765       1665       1605
##   Pop35_39ye Pop40_44ye Pop45_49ye Pop50_54ye Pop55_59ye Pop60_64ye Pop65_69ye
## 1        900        860        755        715        760        820        755
## 2       1800       1820       1940       1750       1335       1150        870
## 3       1170       1175       1150       1035        855        750        515
## 4        860       1050       1085       1030        850        765        610
## 5       1865       1990       2190       2120       1790       1515        940
## 6       1850       1965       2230       1925       1655       1435       1235
##   Pop70_74ye Pop75_79ye Pop80_84ye Pop85years Seniors55a Seniors65a Child0_14
## 1        525        445        365        395       4075       2505       985
## 2        890        825        545        395       6010       3545      4405
## 3        405        360        330        360       3565       1960      2300
## 4        690        755        690        700       5070       3450      2140
## 5        830        755        635        825       7295       3985      3895
## 6       1220       1190        945        970       8595       5520      4170
##   Youth15_24 HomeLangua Language_C Language_I Language_K Language_P Language_1
## 1        925      11445        250        120         55        120         55
## 2       5040      26340       1750       2280        265        370        285
## 3       1755      14285        995        165        580        630         90
## 4       1690      13685        640       2950         50         70        810
## 5       2465      23980        330        755        135         65        555
## 6       3305      26195       8220        340        170        420         70
##   Language_R Language_S Language_T Language_2 Language_U
## 1        180        230         95         10         10
## 2        430       1475        930       1175        580
## 3        770        320        460         15         45
## 4         85        625        605        120         60
## 5        635        455        260         25         35
## 6         65        315        820       1150        780
##                         geometry
## 1 POLYGON ((-79.39119 43.6810...
## 2 POLYGON ((-79.50529 43.7598...
## 3 POLYGON ((-79.43998 43.7615...
## 4 POLYGON ((-79.43969 43.7056...
## 5 POLYGON ((-79.49262 43.6474...
## 6 POLYGON ((-79.31979 43.7683...
head(subway_lines)
## Simple feature collection with 4 features and 3 fields
## Geometry type: LINESTRING
## Dimension:     XY
## Bounding box:  xmin: -79.5354 ymin: 43.63781 xmax: -79.25107 ymax: 43.79677
## Geodetic CRS:  WGS 84
##   OBJECTID                ROUTE_NAME RID                       geometry
## 1    53420 LINE 1 (YONGE-UNIVERSITY)   1 LINESTRING (-79.52813 43.79...
## 2    53421 LINE 2 (BLOOR - DANFORTH)   2 LINESTRING (-79.5354 43.637...
## 3    53422      LINE 3 (SCARBOROUGH)   3 LINESTRING (-79.26332 43.73...
## 4    53423         LINE 4 (SHEPPARD)   4 LINESTRING (-79.41113 43.76...
head(crime_sites)
##          X       Y OBJECTID EVENT_UNIQUE_ID REPORT_DATE               OCC_DATE
## 1 -8855488 5416241   392761  GO-20241185211  2024-06-01 2024/06/01 05:00:00+00
## 2 -8840321 5412794   392762  GO-20241184252  2024-06-01 2024/06/01 05:00:00+00
## 3 -8829780 5425476   392763  GO-20241184623  2024-06-01 2024/06/01 05:00:00+00
## 4 -8851608 5406477   392764  GO-20241184531  2024-06-01 2024/06/01 05:00:00+00
## 5 -8851608 5406477   392765  GO-20241184531  2024-06-01 2024/06/01 05:00:00+00
## 6 -8819720 5439058   392766  GO-20241191208  2024-06-01 2024/06/01 05:00:00+00
##   REPORT_YEAR REPORT_MONTH REPORT_DAY REPORT_DOY REPORT_DOW REPORT_HOUR
## 1        2024         June          1        153 Saturday             7
## 2        2024         June          1        153 Saturday             2
## 3        2024         June          1        153 Saturday             3
## 4        2024         June          1        153 Saturday             3
## 5        2024         June          1        153 Saturday             3
## 6        2024         June          1        153 Saturday            23
##   OCC_YEAR OCC_MONTH OCC_DAY OCC_DOY    OCC_DOW OCC_HOUR DIVISION
## 1     2024      June       1     153 Saturday          1      D23
## 2     2024      June       1     153 Saturday          2      D14
## 3     2024      June       1     153 Saturday          3      D33
## 4     2024      June       1     153 Saturday          3      D22
## 5     2024      June       1     153 Saturday          3      D22
## 6     2024      June       1     153 Saturday         23      D42
##                                                            LOCATION_TYPE
## 1                    Single Home, House (Attach Garage, Cottage, Mobile)
## 2                                  Schools During Un-Supervised Activity
## 3 Other Commercial / Corporate Places (For Profit, Warehouse, Corp. Bldg
## 4                    Dealership (Car, Motorcycle, Marine, Trailer, Etc.)
## 5                    Dealership (Car, Motorcycle, Marine, Trailer, Etc.)
## 6                    Single Home, House (Attach Garage, Cottage, Mobile)
##   PREMISES_TYPE UCR_CODE UCR_EXT                OFFENCE    MCI_CATEGORY
## 1         House     2135     210 Theft Of Motor Vehicle      Auto Theft
## 2   Educational     1610     200      Robbery - Mugging         Robbery
## 3    Commercial     2120     200                    B&E Break and Enter
## 4    Commercial     2120     200                    B&E Break and Enter
## 5    Commercial     2135     210 Theft Of Motor Vehicle      Auto Theft
## 6         House     2120     200                    B&E Break and Enter
##   HOOD_158                    NEIGHBOURHOOD_158 HOOD_140
## 1      007 Willowridge-Martingrove-Richview (7)      007
## 2      080         Palmerston-Little Italy (80)      080
## 3      043                Victoria Village (43)      043
## 4      160               Mimico-Queensway (160)      017
## 5      160               Mimico-Queensway (160)      017
## 6      144            Morningside Heights (144)      131
##                          NEIGHBOURHOOD_140 LONG_WGS84 LAT_WGS84
## 1     Willowridge-Martingrove-Richview (7)  -79.55021  43.68121
## 2             Palmerston-Little Italy (80)  -79.41395  43.65881
## 3                    Victoria Village (43)  -79.31926  43.74118
## 4 Mimico (includes Humber Bay Shores) (17)  -79.51535  43.61775
## 5 Mimico (includes Humber Bay Shores) (17)  -79.51535  43.61775
## 6                              Rouge (131)  -79.22889  43.82926
#transform csv into shapefile
crime_sites = st_as_sf(crime_sites, 
               coords=c('LONG_WGS84','LAT_WGS84'),
               crs=4326, # projection
               remove=TRUE) # remove coordinate columns
crime_sites_wgs84 <- st_transform(crime_sites, crs = 4326)
leaflet() %>%
  # Add CartoDB.Positron layer
  addProviderTiles('CartoDB.Positron') %>%
  
  addCircles(data = crime_sites, 
             color = 'firebrick', 
             fillOpacity = 1, 
             opacity = 1, 
             radius = 25,
             label = ~MCI_CATEGORY,  # Assuming you have a column for crime descriptions/labels
             popup = ~UCR_CODE,  # Popup for more details
             group = "Crime Sites") %>%
  
  # Add subway lines 
  addPolylines(data = subway_lines,
               color = ~case_when(  # Conditional color based on route name
                 ROUTE_NAME == "LINE 1 (YONGE-UNIVERSITY)" ~ "pink",
                 ROUTE_NAME == "LINE 2 (BLOOR - DANFORTH)" ~ "green",
                 ROUTE_NAME == "LINE 3 (SCARBOROUGH)" ~ "blue",
                 ROUTE_NAME == "LINE 4 (SHEPPARD)" ~ "yellow",
                 TRUE ~ "black"),  # Default color
               label = ~ROUTE_NAME,  # Label the subway routes
               popup = ~ROUTE_NAME,
               group = "Subway Lines") %>%
  
  # Add neighborhood polygons 
  addPolygons(data = neighborhoods,
              fillColor = "gray",   # Add a default color for neighborhoods
              fillOpacity = 0.5,
              color = "black",
              weight = 1,
              label = ~Neighbourh,  
              popup = ~Neighbourh,
              group = "Neighborhoods") %>%
  
  
  addLayersControl(
    overlayGroups = c("Crime Sites", "Subway Lines", "Neighborhoods"),
    options = layersControlOptions(collapsed = FALSE)
  )

Part A. 2.

st_crs(crime_sites)
## Coordinate Reference System:
##   User input: EPSG:4326 
##   wkt:
## GEOGCRS["WGS 84",
##     ENSEMBLE["World Geodetic System 1984 ensemble",
##         MEMBER["World Geodetic System 1984 (Transit)"],
##         MEMBER["World Geodetic System 1984 (G730)"],
##         MEMBER["World Geodetic System 1984 (G873)"],
##         MEMBER["World Geodetic System 1984 (G1150)"],
##         MEMBER["World Geodetic System 1984 (G1674)"],
##         MEMBER["World Geodetic System 1984 (G1762)"],
##         MEMBER["World Geodetic System 1984 (G2139)"],
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]],
##         ENSEMBLEACCURACY[2.0]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["geodetic latitude (Lat)",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["geodetic longitude (Lon)",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     USAGE[
##         SCOPE["Horizontal component of 3D system."],
##         AREA["World."],
##         BBOX[-90,-180,90,180]],
##     ID["EPSG",4326]]
st_crs(subway_lines)
## Coordinate Reference System:
##   User input: WGS 84 
##   wkt:
## GEOGCRS["WGS 84",
##     DATUM["World Geodetic System 1984",
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["latitude",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["longitude",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     ID["EPSG",4326]]
st_crs(neighborhoods)
## Coordinate Reference System:
##   User input: WGS 84 
##   wkt:
## GEOGCRS["WGS 84",
##     DATUM["World Geodetic System 1984",
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["latitude",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["longitude",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     ID["EPSG",4326]]

Write-Up: (1) They all have datum as WGS 84. WGS 84 uses an ellipsoid (a mathematically defined, flattened sphere) to approximate the shape of the Earth. The specific ellipsoid used is called the WGS 84 ellipsoid. It uses an origin at the center of the Earth for defining locations. WGS 84 uses a geographic coordinate system based on latitude, longitude, and altitude. These coordinates are expressed in degrees for latitude and longitude, and meters for altitude. (2) The datum defines an origin point of the coordinate axes and the direction of the axes. (3) Yes, there is a need to apply a single datum type to all 3 datasets, so that the the type of ellipsoid model, Reference Frame and coordinate System are the same.

Part A.3 Projection affects spatial operations such as calculating distance and area. Operations between spatial objects also require them to be in the same projection. And UTM projection minimizes distortion over smaller areas.

crime_sites <- st_transform(crime_sites, crs = 32617)
subway_lines <- st_transform(subway_lines, crs = 32617)
neighborhoods <- st_transform(neighborhoods, crs = 32617)

Part A.4

utm17.area = neighborhoods %>% 
  st_area(geometry) %>% set_units(km^2)

tibble(
  neighborhood = neighborhoods$Neighbourh,   
  area_km2 = as.numeric(utm17.area),       
  population = neighborhoods$Total_Popu
)
## # A tibble: 140 × 3
##    neighborhood                 area_km2 population
##    <chr>                           <dbl>      <int>
##  1 Yonge-St.Clair                   1.16      11655
##  2 York University Heights         13.2       27715
##  3 Lansing-Westgate                 5.35      14640
##  4 Yorkdale-Glen Park               6.04      14685
##  5 Stonegate-Queensway              7.95      24690
##  6 Tam O'Shanter-Sullivan           5.42      27390
##  7 The Beaches                      3.60      21135
##  8 Thistletown-Beaumond Heights     3.34      10140
##  9 Thorncliffe Park                 3.13      19225
## 10 Danforth-East York               2.19      16710
## # ℹ 130 more rows
plot(data.frame(utm17.area, neighborhoods$Total_Popu))

cor(as.numeric(utm17.area), as.numeric(neighborhoods$Total_Popu), use = "complete.obs")
## [1] 0.6205686

Write-Up: From the scatter plot, we can see there is a positive correlation (0.6205686) between the area and population. This means population will increase as area increases.

Part A.5

#spatial join of crimes to neighborhoods
neighborhood_crimes <- st_join(neighborhoods, crime_sites, join = st_intersects)
neighborhood_crimes <- st_transform(neighborhood_crimes, crs = 4326)

# Summarize the number of crimes per Neighbourh
crime.neighor = neighborhood_crimes %>% as_tibble() %>% 
  group_by(Neighbourh) %>%
  summarise(total_crimes=n())

population = neighborhoods$Total_Popu  # Sum of population columns (adjust names as needed)
crime_rate_per_capita = crime.neighor$total_crimes / population

crime_summary <- tibble(crime.neighor,population=population,crime_rate_per_capita=crime_rate_per_capita )

Merged_Neighbor_crimes = merge(neighborhoods_wgs84, crime.neighor, by='Neighbourh')
Merged_Neighbor_crimes = merge(Merged_Neighbor_crimes, crime_summary, by='Neighbourh')
#5(a):

crime.pal = colorNumeric(palette = "YlOrRd", 
                         domain=Merged_Neighbor_crimes$total_crimes.x)
 

leaflet(Merged_Neighbor_crimes) %>% 
  addProviderTiles('CartoDB.Positron') %>% 
  addPolygons(weight=0, fillOpacity=.8, color=~crime.pal(Merged_Neighbor_crimes$total_crimes.x),
              label=~paste0(Neighbourh, ": ", total_crimes.x)) %>% 
  addLegend(pal = crime.pal,
            values = Merged_Neighbor_crimes$total_crimes.x,
            title = "Total Crimes",
            position = "bottomright")
#5(b):
crime_rate.pal = colorNumeric(palette = "YlOrRd", 
                         domain=Merged_Neighbor_crimes$crime_rate_per_capita)

leaflet(Merged_Neighbor_crimes) %>% 
  addProviderTiles('CartoDB.Positron') %>% 
  addPolygons(weight=0, fillOpacity=.8, color=~crime_rate.pal(Merged_Neighbor_crimes$crime_rate_per_capita),
              label=~paste0(Merged_Neighbor_crimes$Neighbourh, ": ", Merged_Neighbor_crimes$crime_rate_per_capita)) %>% 
  addLegend(pal = crime_rate.pal, 
            values = Merged_Neighbor_crimes$crime_rate_per_capita,
            title = "Total Crimes",
            position = "bottomright")

From the map showing the counts of crimes in each neighborhood, we can see that Waterfront Communities-The Island have the most number of crimes. And the neighborhood around these this neighborhoods have more crimes compared to others. Also, we can see there are more crimes on the west than the east.

From the map showing the per capita crime rate, we can see that West Humber-clairville have the highest rates. And the neighborhood around these this neighborhoods have higher crimes rates compared to others. Similarly, we can see there are higher crime rate on the west than the east.

Part A.6

subway_buffer <- st_buffer(subway_lines, dist = 1000)
subway_buffer_wgs84 <- st_transform(subway_buffer, crs = 4326)

#Create the leaflet map with the CartoDB tiles
leaflet() %>%
  addProviderTiles('CartoDB.Positron') %>%
  

  addPolylines(data = subway_lines_wgs84, color = "blue", weight = 2, opacity = 0.8) %>%
  
  
  addPolygons(data = subway_buffer_wgs84, fillColor = "red", fillOpacity = 0.3, color = "red", weight = 1) %>%
  
  addCircles(data = crime_sites_wgs84, color = 'black', fillOpacity = 0.1, radius = 25) %>%
  
  addLegend("bottomright", colors = "red", labels = "1 km Buffer", title = "Legend")
crimes_in_buffer <- st_join(subway_buffer_wgs84, crime_sites_wgs84, join = st_intersects)



crime_summary_table <- crimes_in_buffer %>%
  as_tibble() %>% 
  group_by(ROUTE_NAME) %>%  
  summarise(total_crimes_in_buffer = n()) %>%  
  mutate(proportion_of_crimes_occured_within_buffer = total_crimes_in_buffer / nrow(crime_sites))  

crime_summary_table
## # A tibble: 4 × 3
##   ROUTE_NAME                total_crimes_in_buffer proportion_of_crimes_occure…¹
##   <chr>                                      <int>                         <dbl>
## 1 LINE 1 (YONGE-UNIVERSITY)                   1006                        0.256 
## 2 LINE 2 (BLOOR - DANFORTH)                    683                        0.174 
## 3 LINE 3 (SCARBOROUGH)                         111                        0.0283
## 4 LINE 4 (SHEPPARD)                             79                        0.0201
## # ℹ abbreviated name: ¹​proportion_of_crimes_occured_within_buffer

We can see the proportion of the crimes occurred within 1 km subway line 1 is nearly 25.6%, and the proportion of the crimes occurred within 1 km subway line 1 is nearly 17.4%.

Part A.7

#(a)The following neighborhoods has subway
neighborhoods_subway_or_not = st_join(neighborhoods, subway_lines, join = st_intersects)
neighborhoods_subway_or_not <- neighborhoods_subway_or_not %>% distinct(Neighbourh, .keep_all = TRUE)

neighborhoods_subway= filter(neighborhoods_subway_or_not, ROUTE_NAME!='NA')
neighborhoods_without_subway= filter(neighborhoods_subway_or_not, is.na(ROUTE_NAME))
print(neighborhoods_subway$Neighbourh)
##  [1] "Yonge-St.Clair"                    "York University Heights"          
##  [3] "Lansing-Westgate"                  "Yorkdale-Glen Park"               
##  [5] "Danforth-East York"                "Humewood-Cedarvale"               
##  [7] "Islington-City Centre West"        "Danforth"                         
##  [9] "St.Andrew-Windfields"              "Taylor-Massey"                    
## [11] "Church-Yonge Corridor"             "Clairlea-Birchmount"              
## [13] "Ionview"                           "North Riverdale"                  
## [15] "Forest Hill North"                 "Waterfront Communities-The Island"
## [17] "Kennedy Park"                      "Clanton Park"                     
## [19] "Casa Loma"                         "Kensington-Chinatown"             
## [21] "Kingsway South"                    "Runnymede-Bloor West Village"     
## [23] "Forest Hill South"                 "Henry Farm"                       
## [25] "Annex"                             "University"                       
## [27] "Dorset Park"                       "Dovercourt-Wallace Emerson-Juncti"
## [29] "Newtonbrook West"                  "High Park North"                  
## [31] "High Park-Swansea"                 "North St.James Town"              
## [33] "Oakridge"                          "Rosedale-Moore Park"              
## [35] "Bathurst Manor"                    "Bendale"                          
## [37] "Downsview-Roding-CFB"              "Lambton Baby Point"               
## [39] "Bay Street Corridor"               "Willowdale East"                  
## [41] "Willowdale West"                   "Cabbagetown-South St.James Town"  
## [43] "East End-Danforth"                 "Playter Estates-Danforth"         
## [45] "Woburn"                            "Woodbine-Lumsden"                 
## [47] "Bayview Village"                   "Bedford Park-Nortown"             
## [49] "Bridle Path-Sunnybrook-York Mills" "Lawrence Park South"              
## [51] "Lawrence Park North"               "Yonge-Eglinton"
#(b)
neighborhoods_subway_crime <- st_join(neighborhoods_subway, crime_sites, join = st_intersects)
neighborhoods_wihtout_subway_crime <- st_join(neighborhoods_without_subway, crime_sites, join = st_intersects)

neighborhoods_subway_crime_table<-neighborhoods_subway_crime %>% as_tibble() %>% 
  group_by(Neighbourh) %>%
  summarise(total_crimes=n())

neighborhoods_without_subway_crime_table<-neighborhoods_wihtout_subway_crime %>% as_tibble() %>% 
  group_by(Neighbourh) %>%
  summarise(total_crimes=n())

print(mean(neighborhoods_subway_crime_table$total_crimes))
## [1] 34.40385
print(mean(neighborhoods_without_subway_crime_table$total_crimes))
## [1] 24.10227

We can see the average of crimes in the neighborhoods with subway is higher than the average of crimes in the neighborhoods without subway.

Part B.1

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(leaflet)
library(sf)
library(geoR)
## Warning: package 'geoR' was built under R version 4.3.2
## --------------------------------------------------------------
##  Analysis of Geostatistical Data
##  For an Introduction to geoR go to http://www.leg.ufpr.br/geoR
##  geoR version 1.9-4 (built on 2024-02-14) is now loaded
## --------------------------------------------------------------
library(fields)
## Warning: package 'fields' was built under R version 4.3.3
## Loading required package: spam
## Spam version 2.10-0 (2023-10-23) is loaded.
## Type 'help( Spam)' or 'demo( spam)' for a short introduction 
## and overview of this package.
## Help for individual functions is also obtained by adding the
## suffix '.spam' to the function name, e.g. 'help( chol.spam)'.
## 
## Attaching package: 'spam'
## 
## The following objects are masked from 'package:base':
## 
##     backsolve, forwardsolve
## 
## Loading required package: viridisLite
## 
## Try help(fields) to get started.
## 
## Attaching package: 'fields'
## 
## The following object is masked from 'package:leaflet':
## 
##     addLegend
library(maps)
## 
## Attaching package: 'maps'
## 
## The following object is masked from 'package:purrr':
## 
##     map
library(mapdata)
library(units)

Part B (1)(a)

# Set parameter values
h <- seq(1e-5, 15, length.out = 10000)

tau2 <- 0.5
sigma2 <- 4
phi <- 6
lambda <- 0.5

# Define the semivariogram functions
linear <- tau2 + sigma2 * h
power_model <- tau2 + sigma2 * (h**lambda)
rational_quadratic <- tau2 + sigma2 * (h**2 / (1 + phi * (h**2)))
powered_exponential <- tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda))

# Handle the Wave semivariogram
wave <- tau2 + sigma2 * (1 - sin(phi * h) / (phi * h))


# Plot the semivariograms
plot(h, linear, type = "l", col = "red", ylim = c(0, 10), ylab = "Semivariance", xlab = "Distance h")
lines(h, power_model, col = "blue")
lines(h, rational_quadratic, col = "green")
lines(h, powered_exponential, col = "purple")
lines(h, wave, col = "orange")
abline(h = sigma2+tau2, col = "black") 

legend("topright", legend = c("Linear", "Power model", "Rational quadratic", "Powered exponential", "Wave"), col = c("red", "blue", "green", "purple", "orange"), lty = 1, cex = 0.7)

Part B (1)(b)

1.Linear: The semivariance increases linearly, with the increase of distance. The value of semivariance exceed the sill as h(distance) increase to a certain points. 2.Power Model: The semivariance increases with a decreasing slope, with the increase of distance. Similarly, the value of semivariance exceed the sill as h(distance) increase to a certain points. 3.Rational Quadratic: Like power model, the semivariance increases with a decreasing slope. However, the maximum value of semivariance in rational Quadratic model is far smaller than the sill. Semivariance in rational Quadratic model can never reach the sill with the increase of h. 4.Powered Exponential: The semivariance increases with a decreasing slope, with the increase of distance. The value of semivariance approach closely to the sill, when h approaches closely to the range(phi). 5.Wave: At first, semivariance increases as h increase. Then after a certain point, the The value of semivariance oscillates around the sill. Then, as h increases further, the amplitude becomes smaller.

The Powered Exponential and wave model are stationary: As the semivariance approaches to the sill as h increases. This indicates that the spatial correlation between points becomes constant beyond a certain distance. Also it means covariance(C(h)) goes to 0 as h goes to infinity. So C(h) exists. So stationarity is satified.

The line model and power model are not stationary. The value of semivariance increase indefinitely as h(distance) increase.

The Rational Quadratic model is stationary. Because the semivariance becomes more and more flatter as h increases. However, the asymptote that semivariance can reach is smaller the real sill, meaning the variance beyond the range is smaller.

Part B (1)(c)

lambda_values <- c(0.5, 1, 2)


plot(h, tau2 + sigma2 * h^lambda_values[1], type = "l", col = "blue", ylim = c(0, 20),
     ylab = "Semivariance", xlab = "Distance h", main = "Power Models")
lines(h,tau2 + sigma2 * h^lambda_values[2], col = "purple", lwd = 1)
lines(h,tau2 + sigma2 * h^lambda_values[3], col = "darkgreen", lwd = 1)
legend("topright", legend = c("lambda = 0.5", "lambda = 1", "lambda = 2"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

The parameter lambda_values represents the slope of the curves. In other words, λ controls how quickly semivariance increases as the distance increases. For example, (1)when lamda=1, the power model is the same as linear function. (2)When lamda > 1, the semivariance increases with increasing slope as h increase. The increase becomes steeper. The first derivative keeps increasing. (3)When lamda < 1, the semivariance increases with decreasing slop as h increase. The increase is more gradual. The first derivative keeps decreasing

#(c) continued
lambda_values <- c(0.5, 1, 2)


plot(h, tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda_values[1])), type = "l", col = "blue", ylim = c(0, 5),
     ylab = "Semivariance", xlab = "Distance h", main = "Powered Exponential Models")
lines(h,tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda_values[2])), col = "purple", lwd = 1)
lines(h,tau2 + sigma2 * (1 - exp(-abs(phi * h)^lambda_values[3])), col = "darkgreen", lwd = 1)
legend("topright", legend = c("lambda = 0.5", "lambda = 1", "lambda = 2"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

The parameter lambda represents: how fast the semivariance move close to the sill. We can see: The larger the value of lambda is, the faster the semivariance approach to the sill. But no matter what the value of lamda is, the semivariance increases with increasing slope as h increase. The increase becomes steeper.

Part B (1)(d) ######### rational_quadratic #########

tau2_values <- c(0.5, 1, 2)
sigma2_values <- c(4,5,6)
phi_values <- c(6, 10, 14)

rational_quadratic <- tau2 + sigma2 * (h^2 / (1 + phi * h^2))

#rational_quadratic with changing tau

plot(h, tau2_values[1] + sigma2 * (h^2 / (1 + phi * h^2)), type = "l", col = "blue", ylim = c(0, 5),
     ylab = "Semivariance", xlab = "Distance h", main = "Rational quadratic with changing tau")
lines(h,tau2_values[2] + sigma2 * (h^2 / (1 + phi * h^2)), col = "purple", lwd = 1)
lines(h,tau2_values[3] + sigma2 * (h^2 / (1 + phi * h^2)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("tau2 = 0.5", "tau2 = 1", "tau2 = 2"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

tau(nugget) represent the discontinuity at the origin. The parameter tau can move the curves vertically. For the same h, the value of semivariance is larger when tau2 is larger. And, the value of tau does not affect the slope of changing of semivariance.

#rational_quadratic with changing sigma2

plot(h, tau2 + sigma2_values[1] * (h^2 / (1 + phi * h^2)), type = "l", col = "blue", ylim = c(0, 10),
     ylab = "Semivariance", xlab = "Distance h", main = "Rational quadratic with changing sigma2")
lines(h,tau2 + sigma2_values[2] * (h^2 / (1 + phi * h^2)), col = "purple", lwd = 1)
lines(h,tau2 + sigma2_values[3] * (h^2 / (1 + phi * h^2)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("sigma2 = 4", "sigma2 = 5", "sigma2 = 6"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

sigma2(sill) represent the asymptote where semivariance is reached. A larger sigma2 means semivariance can reach a higher value. A smaller sigma2 means semivariance can reach a smaller value. However,the distance where the semivariance stabilizes does not change with sigma2.

phi_values <- c(1, 6,20)
#rational_quadratic with changing phi

plot(h, tau2 + sigma2 * (h^2 / (1 + phi_values[1] * h^2)), type = "l", col = "blue", ylim = c(0, 10),
     ylab = "Semivariance", xlab = "Distance h", main = "Rational quadratic with changing phi")
lines(h,tau2 + sigma2 * (h^2 / (1 + phi_values[2] * h^2)), col = "purple", lwd = 1)
lines(h,tau2 + sigma2 * (h^2 / (1 + phi_values[3] * h^2)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("phi = 1", "phi = 6", "phi = 20"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

phi(range) represents the distance where the semivariance reached the asymptote. Beyond this distance, it is assumed that there is no autocorrelation any more. phi Controls how quickly the semivariogram reaches its sill. A smaller phi means the semivariogram reaches its sill faster. Interestingly, the value of asymptote(the highest possible value of semivariance) does change with phi in the case of rational quadratic.

wave

tau2_values <- c(0.5, 1, 2)
sigma2_values <- c(4,5,6)
phi_values <- c(6, 10, 14)

wave <- tau2 + sigma2 * (1 - sin(phi * h) / (phi * h))


#wave with changing tau

plot(h, tau2_values[1] + sigma2 * (1 - sin(phi * h) / (phi * h)), type = "l", col = "blue", ylim = c(0, 8),
     ylab = "Semivariance", xlab = "Distance h", main = "wave function with changing tau")
lines(h, tau2_values[2] + sigma2 * (1 - sin(phi * h) / (phi * h)), col = "purple", lwd = 1)
lines(h, tau2_values[3] + sigma2 * (1 - sin(phi * h) / (phi * h)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("tau2 = 0.5", "tau2 = 1", "tau2 = 2"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

Similary, tau(nugget) represent the discontinuity at the origin. The parameter tau can move the curves vertically. For the same h, the value of semivariance is larger when tau2 is larger. And, the value of tau does not affect the slope of changing of semivariance. And the shape of wave does not change

#wave with changing sigma2

plot(h, tau2 + sigma2_values[1] * (1 - sin(phi * h) / (phi * h)), type = "l", col = "blue", ylim = c(0, 8),
     ylab = "Semivariance", xlab = "Distance h", main = "wave function with changing sigma2")
lines(h, tau2 + sigma2_values[2] * (1 - sin(phi * h) / (phi * h)), col = "purple", lwd = 1)
lines(h, tau2 + sigma2_values[3] * (1 - sin(phi * h) / (phi * h)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("sigma2 = 4", "sigma2 = 5", "sigma2 = 6"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

Similarly, sigma2(sill) represent the asymptote where semivariance is reached. A larger sigma2 means semivariance can reach a higher value. A smaller sigma2 means semivariance can reach a smaller value. However,the distance where the semivariance stabilizes does not change with sigma2. And the shape of wave does not change

#wave with changing phi

plot(h, tau2 + sigma2 * (1 - sin(phi_values[1] * h) / (phi_values[1]* h)), type = "l", col = "blue", ylim = c(0, 8),
     ylab = "Semivariance", xlab = "Distance h", main = "wave function with changing phi")
lines(h, tau2 + sigma2 * (1 - sin(phi_values[2] * h) / (phi_values[2] * h)), col = "purple", lwd = 1)
lines(h, tau2 + sigma2 * (1 - sin(phi_values[3] * h) / (phi_values[3] * h)), col = "darkgreen", lwd = 1)
legend("topright", legend = c("phi = 6", "phi = 10", "phi = 14"), 
       col = c("blue", "purple", "darkgreen"), lty = 1, lwd = 2)

Similarly, phi(range) represents the distance where the semivariance reached the asymptote. phi Controls how quickly the semivariogram reaches its sill. We can see that the semivariogram of the wave function with a larger phi reaches its sill faster. Note that the value of asymptote does not change with phi.

Part B (2)

#I use the same parameter values as question(1)
exponential <- tau2+sigma2*(1-exp(-h/phi))
spherical <- tau2 +sigma2*(1.5*(h/phi)-0.5*((h/phi)**3))
gaussain <- tau2 +sigma2*(1-exp(-(h**2)/(phi*phi)))


#c(h)= c(0)-semivarigrarm
#c(0)= data variance = sill

cov_exponential = tau2+sigma2 -  exponential
cov_spherical = tau2+sigma2-  spherical
cov_gaussain =tau2+sigma2 -  gaussain

plot(h,cov_exponential,type = "l",ylab = "C(h)", ylim=c(0,10), xlab = "Distance h", main = "C(h) of exponential function")

plot(h,cov_spherical, type = "l",ylab = "C(h)", ylim=c(0,10),xlab = "Distance h", main = "C(h) of spherical function",)

plot(h,cov_gaussain, type = "l", ylab = "C(h)", ylim=c(0,10),xlab = "Distance h", main = "C(h) of gaussain function")

Part B (3)

# For the purpose of neatness, we use a function





matern_covariance <- function(h, kappa, phi, sigma2 = 1) {

    term1 <- (sigma2 / (gamma(kappa) * (2**(kappa - 1))))
    term2 <- (sqrt(2 * kappa) * (h / phi))**kappa
    term3 <- besselK((sqrt(2 * kappa) * (h / phi)), kappa)
    return(term1 * term2 * term3)
  
}
    

# Define parameter values
sigma2 <- 4  
phi <- 6     
h <- seq(0, 10, length.out = 100)  
k_values <- c(0.01, 0.1, 1, 5, 10)  




# Create an empty plot
plot(h, matern_covariance(h, k_values[1], sigma2, phi), type = "l", col = "blue", ylim = c(0, 6),
     ylab = "C(h)", xlab = "Distance h", main = "Matern Covariance with Changing k")
lines(h, matern_covariance(h, k_values[2], sigma2, phi), col = "red")
lines(h, matern_covariance(h, k_values[3], sigma2, phi), col = "green")
lines(h, matern_covariance(h, k_values[4], sigma2, phi), col = "orange")
lines(h, matern_covariance(h, k_values[5], sigma2, phi), col = "black")
lines(h, matern_covariance(h, 10000, sigma2, phi), col = "purple")

legend("topright", legend = paste("kappa =", k_values), col = c("blue", "red", "green","orange","brown","purple"), lty = 1)

As k decrease to 0, the lines becomes a flat line. This means C(h) decrease with a decreasing speed. The shape of line become more similar to C(h) of exponential function When k<1, the C(0) is not equal to sigma2. When k>1, the C(0) is equal to sigma2. As k increase, the lines becomes steeper, which means the C(h) decrease with a increasing speed.The shape of line become more similar to C(h) of gaussain function However, as k increase to infinity, the line vanish, maybe due to numeric instability in R.

Part B (4)

dorian <- read.csv(paste0(data.dir, "/dorian.csv")) 
head(dorian)
##      lat     lon     temp dew.point ceiling.ht wind.dir  wind.sp atm.press
## 1 32.483 -80.717 27.76667  21.13333      22000 315.0000 2.850000  1011.000
## 2 32.550 -80.450 25.48333  21.20000         NA 298.3333 1.083333  1010.500
## 3 32.782 -79.925 25.98333        NA         NA 295.0000 3.350000  1010.067
## 4 32.899 -80.041 25.65000  19.90000      22000 321.6667 4.566667  1010.150
## 5 32.900 -80.033 25.30000  19.70000      22000 320.0000 4.650000  1010.250
## 6 33.350 -79.183 24.88333  21.51667         NA 296.6667 4.700000  1007.833
##         rh
## 1 68.24173
## 2 77.99672
## 3       NA
## 4 71.31048
## 5 71.89816
## 6 81.84029
dorian = dorian %>% 
  st_as_sf(coords=c('lon','lat'), crs=4326, remove=FALSE) %>% #set datum
          st_transform(crs=32617) %>% #project to UTM 11 (meters)
          mutate(x = st_coordinates(.)[,1]/1000, y = st_coordinates(.)[,2]/1000) # extract x, y in km

head(dorian)
## Simple feature collection with 6 features and 11 fields
## Geometry type: POINT
## Dimension:     XY
## Bounding box:  xmin: 526589.8 ymin: 3594010 xmax: 669074.5 ymax: 3691563
## Projected CRS: WGS 84 / UTM zone 17N
##      lat     lon     temp dew.point ceiling.ht wind.dir  wind.sp atm.press
## 1 32.483 -80.717 27.76667  21.13333      22000 315.0000 2.850000  1011.000
## 2 32.550 -80.450 25.48333  21.20000         NA 298.3333 1.083333  1010.500
## 3 32.782 -79.925 25.98333        NA         NA 295.0000 3.350000  1010.067
## 4 32.899 -80.041 25.65000  19.90000      22000 321.6667 4.566667  1010.150
## 5 32.900 -80.033 25.30000  19.70000      22000 320.0000 4.650000  1010.250
## 6 33.350 -79.183 24.88333  21.51667         NA 296.6667 4.700000  1007.833
##         rh                 geometry        x        y
## 1 68.24173 POINT (526589.8 3594010) 526.5898 3594.010
## 2 77.99672 POINT (551638.3 3601535) 551.6383 3601.535
## 3       NA POINT (600670.7 3627631) 600.6707 3627.631
## 4 71.31048 POINT (589689.4 3640498) 589.6894 3640.498
## 5 71.89816 POINT (590436.7 3640616) 590.4367 3640.616
## 6 81.84029 POINT (669074.5 3691563) 669.0745 3691.563
dorian1 <- dorian %>% st_drop_geometry()
head(dorian1)
##      lat     lon     temp dew.point ceiling.ht wind.dir  wind.sp atm.press
## 1 32.483 -80.717 27.76667  21.13333      22000 315.0000 2.850000  1011.000
## 2 32.550 -80.450 25.48333  21.20000         NA 298.3333 1.083333  1010.500
## 3 32.782 -79.925 25.98333        NA         NA 295.0000 3.350000  1010.067
## 4 32.899 -80.041 25.65000  19.90000      22000 321.6667 4.566667  1010.150
## 5 32.900 -80.033 25.30000  19.70000      22000 320.0000 4.650000  1010.250
## 6 33.350 -79.183 24.88333  21.51667         NA 296.6667 4.700000  1007.833
##         rh        x        y
## 1 68.24173 526.5898 3594.010
## 2 77.99672 551.6383 3601.535
## 3       NA 600.6707 3627.631
## 4 71.31048 589.6894 3640.498
## 5 71.89816 590.4367 3640.616
## 6 81.84029 669.0745 3691.563
dorian1_wind<-as.geodata(dorian1,coords.col=c(10,11),data.col=7)
dorian1_atm<-as.geodata(dorian1,coords.col=c(10,11),data.col=8)

semi_var_wind<-variog(dorian1_wind,option="cloud")
## variog: computing omnidirectional variogram
semi_var_atm<-variog(dorian1_atm,option="cloud")
## variog: computing omnidirectional variogram
plot(semi_var_wind,xlab="Distance (h), km", main= 'semivariance of all pairs of point in terms of wind speed')

plot(semi_var_atm,xlab="Distance (h), km",main= 'semivariance of all pairs of point in terms of wind speed')

#empirical semi-variogram of wind speed
semi_var_wind_bin<-variog(dorian1_wind,uvec=seq(0,semi_var_wind$max.dist,l=20),option="bin")
## variog: computing omnidirectional variogram
plot(semi_var_wind_bin, xlab="Distance (h), km",main = "empirical semi-variogram bin plot of wind speed")

semi_var_wind_box<-variog(dorian1_wind,uvec=seq(0,semi_var_wind$max.dist,l=20),bin.cloud=T)
## variog: computing omnidirectional variogram
plot(semi_var_wind_box,bin.cloud=T,xlab="Bin", main = "empirical semi-variogram boxplot of wind speed")

#empirical semi-variogram of wind speed
semi_var_atm_bin<-variog(dorian1_atm,uvec=seq(0,semi_var_atm$max.dist,l=20),option="bin")
## variog: computing omnidirectional variogram
plot(semi_var_atm_bin, xlab="Distance (h), km", main = "empirical semi-variogram bin plot of atmosphere pressure")

semi_var_atm_box<-variog(dorian1_atm,uvec=seq(0,semi_var_atm$max.dist,l=20),bin.cloud=T)
## variog: computing omnidirectional variogram
plot(semi_var_atm_box,bin.cloud=T,xlab="Bin", main = "empirical semi-variogram boxplot of atmosphere pressure")